In [1]:
from datetime import datetime, timedelta, date
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from __future__ import division

import chart_studio.plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go


pyoff.init_notebook_mode()
/Users/murongcui/opt/anaconda3/lib/python3.7/site-packages/IPython/utils/traitlets.py:5: UserWarning:

IPython.utils.traitlets has moved to a top-level traitlets package.

In [2]:
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import classification_report,confusion_matrix, plot_confusion_matrix
from sklearn.preprocessing import StandardScaler

from sklearn.cluster import KMeans

import missingno as msno
In [3]:
# Load the three raw tables used throughout the notebook.
customer, newcustomer, transaction = (
    pd.read_csv(f'data/{name}.csv')
    for name in ('customer', 'newCustomer', 'transaction')
)
In [4]:
# Parse transaction_date once, then derive the month period from the
# already-converted column (the original called pd.to_datetime twice).
transaction['transaction_date'] = pd.to_datetime(transaction['transaction_date'])
transaction['month_year'] = transaction['transaction_date'].dt.to_period('M')

Metric

In [5]:
# Monthly total margin plus its month-over-month growth rate.
margin_month = (
    transaction.groupby('month_year')['margin']
    .sum()
    .reset_index()
)
margin_month['month_year'] = margin_month['month_year'].astype(str)
margin_month['monthlygrowth'] = margin_month['margin'].pct_change()
In [6]:
def plot_monthly_line(x, y, title):
    """Render an offline plotly line chart with a categorical x-axis."""
    fig = go.Figure(
        data=[go.Scatter(x=x, y=y)],
        layout=go.Layout(xaxis={"type": "category"}, title=title),
    )
    pyoff.iplot(fig)

# Fixed typo in both chart titles ('Montly' -> 'Monthly').
plot_monthly_line(margin_month['month_year'], margin_month['margin'],
                  'Monthly Revenue')
plot_monthly_line(margin_month['month_year'], margin_month['monthlygrowth'],
                  'Monthly Growth Rate')
In [7]:
# Distinct purchasing customers per month, charted as bars.
customer_month = (
    transaction.groupby('month_year')['customer_id']
    .nunique()
    .reset_index()
)
customer_month['month_year'] = customer_month['month_year'].astype(str)

active_fig = go.Figure(
    data=[go.Bar(x=customer_month['month_year'],
                 y=customer_month['customer_id'])],
    layout=go.Layout(xaxis={"type": "category"},
                     title='Monthly Number of Active Customers '),
)
pyoff.iplot(active_fig)
In [8]:
# Transaction counts per month, charted as bars.
transaction_month = (
    transaction.groupby('month_year')
    .size()
    .to_frame('size')
    .reset_index()
)
transaction_month['month_year'] = transaction_month['month_year'].astype(str)

txn_fig = go.Figure(
    data=[go.Bar(x=transaction_month['month_year'],
                 y=transaction_month['size'])],
    layout=go.Layout(xaxis={"type": "category"},
                     title='Monthly Number of Transaction'),
)
pyoff.iplot(txn_fig)

RFM Analysis

  • RFM stands for Recency - Frequency - Monetary Value
    • Low Value: Customers who are less active than others, not very frequent buyers/visitors, and generate very low — zero, maybe negative — margin
    • Mid Value: In the middle of everything. Often using our platform (but not as much as our High Values), fairly frequent and generates moderate revenue.
    • High Value: The group we don't want to lose. High Margin, Frequency and low Inactivity.

As the methodology, we need to calculate Recency, Frequency and Monetary Value (Margin) and apply unsupervised machine learning to identify different groups (clusters) for each.

  • Recency: To calculate recency, we need to find the most recent purchase date of each customer and see how many days they have been inactive for. After having the number of inactive days for each customer, we will apply K-means clustering to assign each customer a recency score.

  • Frequency: To create the frequency cluster, we need to find the total number of orders for each customer. First calculate this and see what frequency looks like in our customer database.

  • Monetary Value: Let's see what our customer database looks like when we cluster customers based on revenue. We will calculate revenue for each customer, plot a histogram and apply the same clustering method.

In [9]:
# Peek at the transaction table to confirm the parsed date columns.
transaction.head()
Out[9]:
transaction_id product_id customer_id transaction_date online_order order_status brand product_line product_class product_size list_price standard_cost product_first_sold_date margin month_year
0 1 2 2950 2017-02-25 0.0 Approved Solex Standard medium medium 71.49 53.62 2012-12-02 17.87 2017-02
1 2 3 3120 2017-05-21 1.0 Approved Trek Bicycles Standard medium large 2091.47 388.92 2014-03-03 1702.55 2017-05
2 3 37 402 2017-10-16 0.0 Approved OHM Cycles Standard low medium 1793.43 248.82 1999-07-20 1544.61 2017-10
3 4 88 3135 2017-08-31 0.0 Approved Norco Bicycles Standard medium medium 1198.46 381.10 1998-12-16 817.36 2017-08
4 5 78 787 2017-10-01 1.0 Approved Giant Bicycles Standard medium large 1765.30 709.48 2015-08-10 1055.82 2017-10
In [10]:
# Reference date, computed once: the original re-evaluated
# transaction['transaction_date'].max() inside recency_days for every
# customer group (loop-invariant work repeated per group).
snapshot_date = transaction['transaction_date'].max()

def recency_days(x):
    """Days of inactivity: time between this customer's most recent
    purchase and the latest transaction date in the whole dataset."""
    return (snapshot_date - x.max()).days

# One row per customer: Recency / Frequency / Monetary value / LTV.
customer_rfm = transaction\
.groupby('customer_id')\
.agg({'transaction_date': recency_days,
      'transaction_id': 'count',
      'list_price': 'sum',
      'margin': 'sum'})\
.rename(columns = {'transaction_date': 'recency',
                   'transaction_id': 'frequency',
                   'list_price': 'monetary_value',
                   'margin': 'ltv'})\
.reset_index()
In [11]:
# histplot on the rfm
def plot_rfm(col, title):
    """Offline plotly histogram of one column of the global customer_rfm."""
    hist = go.Histogram(x=customer_rfm[col])
    layout = go.Layout(title=title)
    pyoff.iplot(go.Figure(data=[hist], layout=layout))


# K-means elbow plot
def elbow_method(col, max_k=9, random_state=42):
    """Plot within-cluster SSE (inertia) for k = 1..max_k so the 'elbow'
    can be eyeballed when choosing the number of clusters.

    Parameters
    ----------
    col : str - column of the global customer_rfm to cluster on.
    max_k : int - largest k to try (default 9, as in the original range(1, 10)).
    random_state : int - seed for KMeans; the original ran unseeded, which
        made the plot non-reproducible across kernel restarts.
    """
    sse = {}
    for k in range(1, max_k + 1):
        kmeans = KMeans(n_clusters=k, max_iter=1000,
                        random_state=random_state).fit(customer_rfm[[col]])
        # inertia_ = sum of squared distances to the closest centroid
        sse[k] = kmeans.inertia_
    plt.figure()
    plt.plot(list(sse.keys()), list(sse.values()))
    plt.xlabel('Number of Cluster')
    plt.show()
    
# Calculate k-means cluster summary frame
def cluster_df(n_clusters, col, cluster_name, ascending, random_state=None):
    """Cluster customer_rfm[col] with k-means and build an ordered label map.

    Side effect: adds column `cluster_name` (raw k-means label, arbitrary
    order) to the global customer_rfm.

    Parameters
    ----------
    n_clusters : int - number of k-means clusters.
    col : str - column of customer_rfm to cluster on.
    cluster_name : str - name for the raw label column.
    ascending : bool - sort direction for the reordered labels; False is
        used for recency, where a smaller value (more recent) is better.
    random_state : int or None - optional KMeans seed; pass an int for
        reproducible labels (default None preserves the original
        unseeded behaviour).

    Returns
    -------
    DataFrame of per-cluster descriptive stats with a
    `cluster_name + '_reorder'` column: the cluster's rank by mean of `col`.
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    kmeans.fit(customer_rfm[[col]])
    customer_rfm[cluster_name] = kmeans.predict(customer_rfm[[col]])

    reorder_temp_df = customer_rfm\
    .groupby(cluster_name)[col]\
    .describe()\
    .sort_values(by = 'mean', ascending = ascending)\
    .reset_index()

    # Row position after sorting becomes the human-readable cluster rank.
    reorder_temp_df.index.name = cluster_name + '_reorder'
    reorder_temp_df.reset_index(inplace = True)

    return reorder_temp_df
    
    
In [12]:
# Recency: inspect the distribution, run the elbow diagnostic, then cut
# into 4 clusters (descending mean, so a higher rank = more recently active).
plot_rfm('recency', 'Recency');
elbow_method('recency')
recency_kwargs = dict(n_clusters=4, col='recency',
                      cluster_name='recency_cluster', ascending=False)
recency_cluster_reorder = cluster_df(**recency_kwargs)

recency_cluster_reorder
Out[12]:
recency_cluster_reorder recency_cluster count mean std min 25% 50% 75% max
0 0 1 157.0 233.133758 39.930243 185.0 199.0 221.0 258.0 353.0
1 1 3 497.0 135.933602 22.608288 103.0 115.0 134.0 154.0 184.0
2 2 2 1065.0 69.361502 16.269633 45.0 55.0 67.0 82.0 102.0
3 3 0 1775.0 19.409577 12.855963 0.0 9.0 18.0 30.0 44.0
In [13]:
# Frequency: distribution, elbow diagnostic, then 4 clusters
# (ascending mean, so a higher rank = more frequent buyer).
plot_rfm('frequency', 'Frequency');
elbow_method('frequency')
frequency_kwargs = dict(n_clusters=4, col='frequency',
                        cluster_name='frequency_cluster', ascending=True)
frequency_cluster_reorder = cluster_df(**frequency_kwargs)

frequency_cluster_reorder
Out[13]:
frequency_cluster_reorder frequency_cluster count mean std min 25% 50% 75% max
0 0 0 1111.0 3.179118 0.880906 1.0 3.0 3.0 4.0 4.0
1 1 2 1170.0 5.486325 0.500027 5.0 5.0 5.0 6.0 6.0
2 2 1 994.0 7.729376 0.783903 7.0 7.0 8.0 8.0 9.0
3 3 3 219.0 10.803653 1.010398 10.0 10.0 10.0 11.0 14.0
In [14]:
# Monetary value: distribution, elbow diagnostic, then 4 clusters
# (ascending mean, so a higher rank = bigger spender).
plot_rfm('monetary_value', 'Monetary Value');
elbow_method('monetary_value')
monetary_kwargs = dict(n_clusters=4, col='monetary_value',
                       cluster_name='monetary_value_cluster', ascending=True)
monetary_value_cluster_reorder = cluster_df(**monetary_kwargs)

monetary_value_cluster_reorder
Out[14]:
monetary_value_cluster_reorder monetary_value_cluster count mean std min 25% 50% 75% max
0 0 2 932.0 2951.590107 1020.756633 60.34 2280.6150 3157.495 3812.0550 4335.00
1 1 0 1314.0 5710.658805 793.547761 4340.25 5027.0675 5698.725 6359.3175 7143.39
2 2 3 910.0 8555.210429 899.068965 7145.67 7782.1025 8454.320 9253.8750 10366.86
3 3 1 338.0 12179.466982 1581.196622 10372.68 10943.1900 11803.550 12913.6100 19071.32
In [15]:
# Lifetime value (total margin): distribution, elbow diagnostic, then
# 3 clusters (ascending mean, so a higher rank = higher lifetime value).
plot_rfm('ltv', 'Lifetime Value');
elbow_method('ltv')
ltv_kwargs = dict(n_clusters=3, col='ltv',
                  cluster_name='ltv_cluster', ascending=True)
ltv_cluster_reorder = cluster_df(**ltv_kwargs)

ltv_cluster_reorder
Out[15]:
ltv_cluster_reorder ltv_cluster count mean std min 25% 50% 75% max
0 0 1 1494.0 1574.125626 683.313855 15.08 1062.495 1691.385 2159.2325 2582.78
1 1 0 1435.0 3577.094216 647.758815 2584.56 3014.265 3516.000 4084.8800 4842.52
2 2 2 565.0 6098.070938 1168.650839 4847.05 5218.200 5796.160 6580.5600 11668.95
In [16]:
# Attach each ordered cluster rank to the customer-level RFM table,
# merging in the same order as before.
reorder_tables = [
    (monetary_value_cluster_reorder, 'monetary_value_cluster'),
    (frequency_cluster_reorder, 'frequency_cluster'),
    (recency_cluster_reorder, 'recency_cluster'),
    (ltv_cluster_reorder, 'ltv_cluster'),
]

all_clusters = customer_rfm
for table, key in reorder_tables:
    all_clusters = all_clusters.merge(table[[key + '_reorder', key]], on=key)

all_clusters = all_clusters[['customer_id', 'recency', 'frequency', 'monetary_value', 'ltv',
       'monetary_value_cluster_reorder', 'frequency_cluster_reorder',
       'recency_cluster_reorder', 'ltv_cluster_reorder']]

# Overall score = sum of the three RFM ranks (the LTV rank stays separate).
all_clusters['overall'] = (all_clusters['recency_cluster_reorder']
                           + all_clusters['frequency_cluster_reorder']
                           + all_clusters['monetary_value_cluster_reorder'])

all_clusters.head()
Out[16]:
customer_id recency frequency monetary_value ltv monetary_value_cluster_reorder frequency_cluster_reorder recency_cluster_reorder ltv_cluster_reorder overall
0 1 7 11 9084.45 3018.09 2 3 3 1 8
1 127 23 10 9949.62 3993.71 2 3 3 1 8
2 235 2 11 9629.61 4685.82 2 3 3 1 8
3 359 8 10 8564.63 2921.76 2 3 3 1 8
4 516 19 10 9021.70 4722.21 2 3 3 1 8
In [17]:
# Selecting multiple labels after groupby(...)[...] without a list is
# deprecated (removed in pandas 2.0) -- use a list selection instead.
all_clusters.groupby('overall')[['recency', 'frequency', 'monetary_value', 'ltv']].mean()
Out[17]:
recency frequency monetary_value ltv
overall
0 241.592233 2.271845 2230.762913 1014.527573
1 151.645933 2.861244 2943.079665 1393.340144
2 95.014706 3.450000 3592.424147 1702.794456
3 62.597328 3.965649 4183.671718 2025.519313
4 55.917603 5.102996 5528.176067 2746.642491
5 40.051993 5.878683 6349.314679 3150.197799
6 41.021739 7.052174 7835.489196 3876.107543
7 27.537736 7.931604 9026.232028 4441.183090
8 24.935644 9.198020 11266.319010 5712.928861
9 18.561983 11.033058 13018.215868 6527.300413
In [18]:
# Map the overall RFM score onto three named segments:
# 0-2 -> Low-Value, 3-7 -> Mid-Value, 8+ -> High-Value.
# np.select picks the first matching condition, so test the higher cut first.
all_clusters['segment'] = np.select(
    [all_clusters['overall'] > 7, all_clusters['overall'] > 2],
    ['High-Value', 'Mid-Value'],
    default='Low-Value',
)
In [19]:
# Four stacked scatter panels showing how the segments separate on each
# pair of RFM dimensions (and overall score vs lifetime value).
fig, axes = plt.subplots(4, 1, figsize=(15, 30))

panel_specs = [
    ('recency', 'frequency'),
    ('recency', 'monetary_value'),
    ('frequency', 'monetary_value'),
    ('overall', 'ltv'),
]
for ax, (x_col, y_col) in zip(axes, panel_specs):
    sns.scatterplot(x=x_col, y=y_col, hue='segment', data=all_clusters, ax=ax)
sns.despine();

Customer Lifetime Value Prediction

LTV prediction with XGBoost Multi-classification

Lifetime Value: Total Gross Revenue - Total Cost

In [201]:
# Join customer attributes with LTV / segment, then one-hot encode the
# categorical columns (drop_first=True avoids the dummy-variable trap).
# (Removed blocks of commented-out dead code from earlier experiments.)
features = customer.merge(all_clusters[['customer_id', 'ltv', 'segment']],
               on = 'customer_id')[['customer_id','segment', 'gender','past_3_years_bike_related_purchases',
                                    'wealth_segment', 'owns_car', 'tenure', 'state', 'age_group',
                                    'property_valuation', 'age', 'ltv']]
features_dummy = pd.get_dummies(features, columns = ['wealth_segment', 'state', 'owns_car', 'gender'], drop_first = True)


def feature_selection(col):
    """Build (X, y) for modeling from the global features_dummy.

    Parameters
    ----------
    col : list of str - columns of features_dummy to keep; must include
        'ltv', which is split off as the target.

    Returns
    -------
    (X, y) with rows containing missing values dropped.
    """
    features_c = features_dummy[col]
    features_c = features_c.dropna()
    y = features_c.pop('ltv')   # pop mutates only this local copy
    X = features_c
    return X, y
In [203]:
# Persist the modeling features for reuse outside this notebook.
features.to_csv('data/features.csv', index=False)
In [122]:
# Columns available after one-hot encoding.
features_dummy.columns
Out[122]:
Index(['past_3_years_bike_related_purchases', 'tenure', 'age_group',
       'property_valuation', 'age', 'ltv', 'wealth_segment_High Net Worth',
       'wealth_segment_Mass Customer', 'state_QLD', 'state_VIC',
       'owns_car_Yes', 'gender_Male'],
      dtype='object')
In [61]:
# Model on the numeric attributes only; the target is continuous LTV.
# (Removed a commented-out alternative feature list -- dead code.)
col = ['past_3_years_bike_related_purchases', 'tenure', 'property_valuation',
       'age', 'ltv']

X, y = feature_selection(col)
# Fixed seed keeps the split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

Modeling

In [123]:
# Interval categories render poorly as plot labels; stringify them first.
features = features.assign(age_group=features['age_group'].astype(str))
features.head()
Out[123]:
gender past_3_years_bike_related_purchases wealth_segment owns_car tenure state age_group property_valuation age ltv
0 Female 93 Mass Customer Yes 11.0 NSW (59.0, 69.0] 10 67.0 3018.09
1 Male 81 Mass Customer Yes 16.0 NSW (39.0, 49.0] 10 40.0 2226.26
2 Male 33 Mass Customer No 7.0 QLD (49.0, 59.0] 9 59.0 220.57
3 Female 56 Affluent Customer Yes 8.0 NSW (39.0, 49.0] 4 43.0 2394.94
4 Male 35 High Net Worth Yes 13.0 VIC (49.0, 59.0] 9 54.0 3946.55
In [193]:
# Mean LTV per age bucket; [: -1] drops the last row -- presumably the
# 'nan' age_group created by the astype(str) above (TODO confirm).
df = features.groupby('age_group')['ltv'].mean().reset_index()[: -1]
In [196]:
# Mean LTV by age group, with the two oldest buckets highlighted.
sns.set_context("poster", font_scale = .5, rc={"grid.linewidth": 0.6})
plt.figure(figsize = (15, 15))
ax = sns.barplot(data=df, 
             x="age_group", 
            y = 'ltv', color = '#5D6D7E');
ax.patches[-2].set_color('#E9967A')
ax.patches[-1].set_color('#95A5A6')
# Removed broken `ax,set_ticks([])`: the comma typo made it raise
# NameError, and matplotlib Axes has no set_ticks method anyway.
ax.set_title('customer lifetime value by age group')
ax.set_ylabel('Customer Lifetime Value')
ax.set_xlabel('Age Group')
sns.despine();
In [164]:
# Mean LTV by wealth segment.
df = features.groupby('wealth_segment')['ltv'].mean().reset_index()
In [165]:
# Mean LTV by wealth segment. (Removed commented-out color tweaks -- dead code.)
sns.set_context("poster", font_scale = .5, rc={"grid.linewidth": 0.6})
plt.figure(figsize = (15, 15))
ax = sns.barplot(data=df, 
             x = "wealth_segment", 
             y = 'ltv', 
             color = '#5D6D7E');
sns.despine();
In [176]:
# Mean LTV by tenure, ordered from highest to lowest LTV.
df = features.groupby('tenure')['ltv'].mean().reset_index().sort_values('ltv', ascending = False)
In [182]:
# Mean LTV by tenure. (Removed commented-out color tweaks -- dead code.)
sns.set_context("poster", font_scale = .5, rc={"grid.linewidth": 0.6})
plt.figure(figsize = (15, 15))
ax = sns.barplot(data=df, 
             x = "tenure", 
             y = 'ltv', 
             color = '#5D6D7E');
sns.despine();
In [187]:
# Mean LTV by property valuation score (1-12 scale in the data shown below).
df = features.groupby('property_valuation')['ltv'].mean().reset_index()
df.head()
Out[187]:
property_valuation ltv
0 1 3171.423885
1 2 3337.495000
2 3 3078.948188
3 4 3285.924599
4 5 3116.316837
In [188]:
# Mean LTV by property valuation. (Removed commented-out color tweaks -- dead code.)
sns.set_context("poster", font_scale = .5, rc={"grid.linewidth": 0.6})
plt.figure(figsize = (15, 15))
ax = sns.barplot(data=df, 
             x = "property_valuation", 
             y = 'ltv', 
             color = '#5D6D7E');
sns.despine();
In [190]:
# Mean LTV by state.
df = features.groupby('state')['ltv'].mean().reset_index()
df.head()
Out[190]:
state ltv
0 NSW 3103.076554
1 QLD 3151.839098
2 VIC 3160.909205
In [191]:
# Mean LTV by state. (Removed commented-out color tweaks -- dead code.)
sns.set_context("poster", font_scale = .5, rc={"grid.linewidth": 0.6})
plt.figure(figsize = (15, 15))
ax = sns.barplot(data=df, 
             x = "state", 
             y = 'ltv', 
             color = '#5D6D7E');
sns.despine();
In [98]:
# Nested donut: outer ring = CLV segment sizes, inner ring = gender split
# within each segment. (Removed the redundant mid-notebook
# `import matplotlib.pyplot as plt` -- it is imported at the top.)
# NOTE(review): sizes are hard-coded from earlier value_counts output;
# recompute from the data if the inputs change.

# Data to plot
labels = ['Low CLV','Mid CLV', 'High CLV']
sizes = [1493, 1432, 564]
labels_gender = ['Female','Male','Female','Male','Female','Male']
sizes_gender = [769, 698, 717, 680, 272, 276]

colors = ['#99ff99', '#ffcc99', '#ff6666']
colors_gender = ['#ffb3e6', '#c2c2f0','#ffb3e6','#c2c2f0','#ffb3e6','#c2c2f0']
explode = (0.2,0.2,0.2) 
explode_gender = (0.1,0.1,0.1,0.1,0.1,0.1)

# Plot: two concentric pies, then a white disc to make a donut.
plt.pie(sizes, labels=labels, colors=colors, startangle=90,frame=True, explode=explode,radius=3)
plt.pie(sizes_gender,colors=colors_gender,startangle=90, explode=explode_gender,radius=2 )
centre_circle = plt.Circle((0,0),1.5,color='black', fc='white',linewidth=0)
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.axis('equal')
plt.tight_layout()
plt.show()
In [119]:
# Nested donut, flipped: outer ring = gender totals, inner ring = CLV
# segment split within each gender.
labels = ['Female', 'Male']
sizes = [1758, 1654]
labels_gender = ['Low CLT', 'Mid CLT', 'High CLT', 'Low CLT', 'Mid CLT', 'High CLT']
sizes_gender = [769, 717, 272, 698, 680, 276]

segment_palette = ['#99ff99', '#ffcc99', '#ff6666']
colors_gender = segment_palette * 2
colors = ['#ffb3e6', '#c2c2f0']
explode_gender = (0.1,) * 6
explode = (0.2, 0.2)

# Outer pie (gender), inner pie (segments), then a white disc on top to
# turn the stack into a donut.
plt.pie(sizes, labels=labels, colors=colors, startangle=90, frame=True,
        explode=explode, radius=3)
plt.pie(sizes_gender, colors=colors_gender, startangle=90,
        explode=explode_gender, radius=2)
centre_circle = plt.Circle((0, 0), 1.5, color='black', fc='white', linewidth=0)
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.axis('equal')
plt.tight_layout()
plt.show()
In [110]:
# Gender counts used for the hard-coded pie-chart sizes above.
features.gender.value_counts()
Out[110]:
Female    1758
Male      1654
Name: gender, dtype: int64
In [109]:
# NOTE(review): 'ltv_cluster_reorder' is not a column of the `features`
# frame built earlier in this notebook (it carries 'ltv' and 'segment');
# this cell relies on stale kernel state and will fail on Restart & Run All.
features.groupby(['gender', 'ltv_cluster_reorder']).size()
Out[109]:
gender  ltv_cluster_reorder
Female  0                      769
        1                      717
        2                      272
Male    0                      698
        1                      680
        2                      276
dtype: int64
In [108]:
# Tenure distribution split by LTV cluster.
# NOTE(review): depends on 'ltv_cluster_reorder' existing in `features`,
# which the current notebook code does not create -- stale-state hazard.
plt.figure(figsize = (15, 15))
sns.histplot(data=features, 
             x="tenure", 
             kde=True, 
            hue = 'ltv_cluster_reorder',
            bins = 5);
sns.despine();
In [104]:
 
Out[104]:
Index(['gender', 'past_3_years_bike_related_purchases', 'wealth_segment',
       'owns_car', 'tenure', 'state', 'age_group', 'property_valuation', 'age',
       'ltv_cluster_reorder'],
      dtype='object')
In [ ]:
 
In [ ]:
 
In [ ]: